import pandas as pd
import numpy as np # For mathematical calculations
import seaborn as sns # For data visualization
import matplotlib.pyplot as plt
import seaborn as sn # For plotting graphs
import io
%matplotlib inline
import warnings # To ignore any warnings
warnings.filterwarnings("ignore")
# Load the Zomato restaurant dataset from the local Excel export and echo it.
data_path = r"C:\Users\91623\OneDrive\Desktop\zomato project\final\data1.xlsx"
df = pd.read_excel(data_path)
print(df)
NAME PRICE \
0 Hitchki 1200.0
1 Baba Falooda 400.0
2 Chin Chin Chu 1800.0
3 Butterfly High 1000.0
4 BKC DIVE 1200.0
... ... ...
11523 Hari Om Snack Bar 350.0
11524 PitaBurg 400.0
11525 Uncha Otlawala 300.0
11526 Mandarin Panda 400.0
11527 NaN NaN
CUSINE_CATEGORY \
0 Modern Indian,North Indian,Chinese,Momos,Birya...
1 Desserts,Ice Cream,Beverages
2 Asian,Chinese
3 Modern Indian
4 North Indian,Chinese,Continental
... ...
11523 Fast Food,South Indian,Chinese
11524 Fast Food,Lebanese
11525 Desserts,Ice Cream
11526 Desserts,Chinese,Thai
11527 NaN
REGION CUSINE TYPE \
0 First International Financial Centre-- Bandra ... Casual Dining
1 Mahim Dessert Parlor
2 Juhu Casual Dining
3 Bandra Kurla Complex Bar
4 Bandra Kurla Complex Bar
... ... ...
11523 Kandivali West Quick Bites
11524 Lower Parel none
11525 Kandivali West Dessert Parlor
11526 Malad West none
11527 NaN NaN
RATING_TYPE RATING VOTES
0 Excellent 4.9 3529
1 Very Good 4.4 1723
2 Very Good 4.2 337
3 Very Good 4.3 1200
4 Veľmi dobré 4.4 5995
... ... ... ...
11523 Good 3.7 64
11524 Average 3.4 99
11525 Good 3.5 29
11526 Good 3.7 121
11527 NaN NaN NaN
[11528 rows x 8 columns]
# Preview the first five rows after loading.
df.head()
| NAME | PRICE | CUSINE_CATEGORY | REGION | CUSINE TYPE | RATING_TYPE | RATING | VOTES | |
|---|---|---|---|---|---|---|---|---|
| 0 | Hitchki | 1200.0 | Modern Indian,North Indian,Chinese,Momos,Birya... | First International Financial Centre-- Bandra ... | Casual Dining | Excellent | 4.9 | 3529 |
| 1 | Baba Falooda | 400.0 | Desserts,Ice Cream,Beverages | Mahim | Dessert Parlor | Very Good | 4.4 | 1723 |
| 2 | Chin Chin Chu | 1800.0 | Asian,Chinese | Juhu | Casual Dining | Very Good | 4.2 | 337 |
| 3 | Butterfly High | 1000.0 | Modern Indian | Bandra Kurla Complex | Bar | Very Good | 4.3 | 1200 |
| 4 | BKC DIVE | 1200.0 | North Indian,Chinese,Continental | Bandra Kurla Complex | Bar | Veľmi dobré | 4.4 | 5995 |
# Inventory of low-cardinality columns (< 100 distinct values) and their values.
# The original also tested len(unique) > 0, which is vacuously true and removed.
unique = [feature for feature in df.columns if df[feature].nunique(dropna=False) < 100]
for feature in unique:
    print("{} has {} unique values : {} {}".format(
        feature, len(df[feature].unique()), df[feature].unique(), "\n"))
PRICE has 64 unique values : [1200. 400. 1800. 1000. 800. 1300. 1500. 600. 1400. 1100. 2000. 350. 900. 700. 500. 1600. 150. 300. 550. 450. 650. 750. 250. 200. 850. 100. 2500. 1450. 180. 950. 1700. 1900. 2600. 2400. 3000. 1250. 3200. 505. 2200. 1050. 1650. 1350. 50. 920. 1150. 1550. 3500. 398. 5000. 120. 160. 5. 480. 2100. 580. 1850. 220. 249. 2300. 0. 360. 248. 2700. nan] CUSINE TYPE has 23 unique values : ['Casual Dining' 'Dessert Parlor' 'Bar' 'Café' 'Quick Bites' 'none' 'Bakery' 'Sweet Shop' 'Food Court' 'Fine Dining' 'Beverage Shop' 'Pub' 'Food Truck' 'Dhaba' 'Lounge' 'Kiosk' 'Microbrewery' 'Paan Shop' 'Irani Cafe' 'Confectionery' 'Mess' 'Bhojanalya' nan] RATING_TYPE has 31 unique values : ['Excellent' 'Very Good' 'Veľmi dobré' 'Good' 'Velmi dobré' 'None' 'Average' 'Excelente' 'Muito Bom' 'Poor' 'Skvělá volba' 'Çok iyi' 'Baik' 'Bardzo dobrze' 'Bom' 'Média' 'Dobrze' 'Buono' 'İyi' 'Bueno' 'Ortalama' 'Skvělé' 'Biasa' 'Průměr' 'Sangat Baik' 'Priemer' 'Dobré' 'Promedio' 'Muy Bueno' 'Media' nan] RATING has 34 unique values : ['4.9' '4.4' '4.2' '4.3' '4.5' '4.7' '4.0' '4.6' '4.1' '3.9' '3.8' '3.6' 'NEW' '3.4' '3.0' '3.7' '4.8' '3.3' '3.5' '3.2' '3.1' '2.7' '2.5' '2.8' '2.6' '2.3' '2.9' '2.4' 'Opening' '2.1' '2.2' '1.8' '2.0' nan]
# Column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11528 entries, 0 to 11527 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 NAME 11527 non-null object 1 PRICE 11527 non-null float64 2 CUSINE_CATEGORY 11526 non-null object 3 REGION 11527 non-null object 4 CUSINE TYPE 11527 non-null object 5 RATING_TYPE 11527 non-null object 6 RATING 11527 non-null object 7 VOTES 11527 non-null object dtypes: float64(1), object(7) memory usage: 720.6+ KB
# Raw dataset dimensions.
df.shape
(11528, 8)
# Normalise every column name to lowercase for consistent access below.
df.columns = [col.lower() for col in df.columns]
df
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | |
|---|---|---|---|---|---|---|---|---|
| 0 | Hitchki | 1200.0 | Modern Indian,North Indian,Chinese,Momos,Birya... | First International Financial Centre-- Bandra ... | Casual Dining | Excellent | 4.9 | 3529 |
| 1 | Baba Falooda | 400.0 | Desserts,Ice Cream,Beverages | Mahim | Dessert Parlor | Very Good | 4.4 | 1723 |
| 2 | Chin Chin Chu | 1800.0 | Asian,Chinese | Juhu | Casual Dining | Very Good | 4.2 | 337 |
| 3 | Butterfly High | 1000.0 | Modern Indian | Bandra Kurla Complex | Bar | Very Good | 4.3 | 1200 |
| 4 | BKC DIVE | 1200.0 | North Indian,Chinese,Continental | Bandra Kurla Complex | Bar | Veľmi dobré | 4.4 | 5995 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11523 | Hari Om Snack Bar | 350.0 | Fast Food,South Indian,Chinese | Kandivali West | Quick Bites | Good | 3.7 | 64 |
| 11524 | PitaBurg | 400.0 | Fast Food,Lebanese | Lower Parel | none | Average | 3.4 | 99 |
| 11525 | Uncha Otlawala | 300.0 | Desserts,Ice Cream | Kandivali West | Dessert Parlor | Good | 3.5 | 29 |
| 11526 | Mandarin Panda | 400.0 | Desserts,Chinese,Thai | Malad West | none | Good | 3.7 | 121 |
| 11527 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11528 rows × 8 columns
# Per-column missing-value counts.
df.isnull().sum()
name 1 price 1 cusine_category 2 region 1 cusine type 1 rating_type 1 rating 1 votes 1 dtype: int64
# Drop the 2 rows with missing values (the all-NaN trailing row plus one row
# missing cusine_category): 11528 -> 11526 rows.
df= df.dropna()
df.isnull().sum()
name 0 price 0 cusine_category 0 region 0 cusine type 0 rating_type 0 rating 0 votes 0 dtype: int64
# Re-check value inventories after dropping NaN rows. (Same logic as the
# earlier cell; the always-true len(...) > 0 test is removed.)
unique = [feature for feature in df.columns if df[feature].nunique(dropna=False) < 100]
for feature in unique:
    print("{} has {} unique values : {} {}".format(
        feature, len(df[feature].unique()), df[feature].unique(), "\n"))
price has 63 unique values : [1200. 400. 1800. 1000. 800. 1300. 1500. 600. 1400. 1100. 2000. 350. 900. 700. 500. 1600. 150. 300. 550. 450. 650. 750. 250. 200. 850. 100. 2500. 1450. 180. 950. 1700. 1900. 2600. 2400. 3000. 1250. 3200. 505. 2200. 1050. 1650. 1350. 50. 920. 1150. 1550. 3500. 398. 5000. 120. 160. 5. 480. 2100. 580. 1850. 220. 249. 2300. 0. 360. 248. 2700.] cusine type has 22 unique values : ['Casual Dining' 'Dessert Parlor' 'Bar' 'Café' 'Quick Bites' 'none' 'Bakery' 'Sweet Shop' 'Food Court' 'Fine Dining' 'Beverage Shop' 'Pub' 'Food Truck' 'Dhaba' 'Lounge' 'Kiosk' 'Microbrewery' 'Paan Shop' 'Irani Cafe' 'Confectionery' 'Mess' 'Bhojanalya'] rating_type has 30 unique values : ['Excellent' 'Very Good' 'Veľmi dobré' 'Good' 'Velmi dobré' 'None' 'Average' 'Excelente' 'Muito Bom' 'Poor' 'Skvělá volba' 'Çok iyi' 'Baik' 'Bardzo dobrze' 'Bom' 'Média' 'Dobrze' 'Buono' 'İyi' 'Bueno' 'Ortalama' 'Skvělé' 'Biasa' 'Průměr' 'Sangat Baik' 'Priemer' 'Dobré' 'Promedio' 'Muy Bueno' 'Media'] rating has 33 unique values : ['4.9' '4.4' '4.2' '4.3' '4.5' '4.7' '4.0' '4.6' '4.1' '3.9' '3.8' '3.6' 'NEW' '3.4' '3.0' '3.7' '4.8' '3.3' '3.5' '3.2' '3.1' '2.7' '2.5' '2.8' '2.6' '2.3' '2.9' '2.4' 'Opening' '2.1' '2.2' '1.8' '2.0']
# Normalize localized RATING_TYPE labels to their English equivalents. The
# dataset mixes Slovak, Czech, Polish, Portuguese, Spanish, Italian, Turkish
# and Indonesian labels; several were previously mapped to the WRONG English
# bucket (e.g. 'Muito Bom' = "Very Good" was mapped to 'Poor').
rating_type_map = {
    'Excelente': 'Excellent',       # es/pt
    'Skvělé': 'Excellent',          # cs "great"
    'Veľmi dobré': 'Very Good',     # sk
    'Velmi dobré': 'Very Good',     # cs
    'Bardzo dobrze': 'Very Good',   # pl
    'Muy Bueno': 'Very Good',       # es
    'Muito Bom': 'Very Good',       # pt (was wrongly 'Poor')
    'Çok iyi': 'Very Good',         # tr (was wrongly 'Average')
    'Sangat Baik': 'Very Good',     # id (was wrongly 'Very Poor')
    'Skvělá volba': 'Very Good',    # cs "great choice"
    'Dobrze': 'Good',               # pl
    'Bueno': 'Good',                # es
    'Buono': 'Good',                # it
    'Dobré': 'Good',                # sk/cs
    'Bom': 'Good',                  # pt
    'İyi': 'Good',                  # tr (was wrongly 'Poor')
    'Baik': 'Good',                 # id (was wrongly 'Very Poor')
    'Priemer': 'Average',           # sk
    'Průměr': 'Average',            # cs (was wrongly 'Poor')
    'Média': 'Average',             # pt
    'Media': 'Average',             # it (was wrongly 'Very Poor')
    'Promedio': 'Average',          # es (was wrongly 'Poor')
    'Ortalama': 'Average',          # tr (was wrongly 'Poor')
    'Biasa': 'Average',             # id (was wrongly 'Very Poor')
}
df['rating_type'] = df['rating_type'].replace(rating_type_map)
df
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | |
|---|---|---|---|---|---|---|---|---|
| 0 | Hitchki | 1200.0 | Modern Indian,North Indian,Chinese,Momos,Birya... | First International Financial Centre-- Bandra ... | Casual Dining | Excellent | 4.9 | 3529 |
| 1 | Baba Falooda | 400.0 | Desserts,Ice Cream,Beverages | Mahim | Dessert Parlor | Very Good | 4.4 | 1723 |
| 2 | Chin Chin Chu | 1800.0 | Asian,Chinese | Juhu | Casual Dining | Very Good | 4.2 | 337 |
| 3 | Butterfly High | 1000.0 | Modern Indian | Bandra Kurla Complex | Bar | Very Good | 4.3 | 1200 |
| 4 | BKC DIVE | 1200.0 | North Indian,Chinese,Continental | Bandra Kurla Complex | Bar | Very Good | 4.4 | 5995 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11522 | Tirupati Balaji | 500.0 | Chinese,Fast Food,North Indian | Oshiwara-- Andheri West | Casual Dining | Good | 3.5 | 267 |
| 11523 | Hari Om Snack Bar | 350.0 | Fast Food,South Indian,Chinese | Kandivali West | Quick Bites | Good | 3.7 | 64 |
| 11524 | PitaBurg | 400.0 | Fast Food,Lebanese | Lower Parel | none | Average | 3.4 | 99 |
| 11525 | Uncha Otlawala | 300.0 | Desserts,Ice Cream | Kandivali West | Dessert Parlor | Good | 3.5 | 29 |
| 11526 | Mandarin Panda | 400.0 | Desserts,Chinese,Thai | Malad West | none | Good | 3.7 | 121 |
11526 rows × 8 columns
# Spot-check the normalised rating_type values.
df.head()
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | |
|---|---|---|---|---|---|---|---|---|
| 0 | Hitchki | 1200.0 | Modern Indian,North Indian,Chinese,Momos,Birya... | First International Financial Centre-- Bandra ... | Casual Dining | Excellent | 4.9 | 3529 |
| 1 | Baba Falooda | 400.0 | Desserts,Ice Cream,Beverages | Mahim | Dessert Parlor | Very Good | 4.4 | 1723 |
| 2 | Chin Chin Chu | 1800.0 | Asian,Chinese | Juhu | Casual Dining | Very Good | 4.2 | 337 |
| 3 | Butterfly High | 1000.0 | Modern Indian | Bandra Kurla Complex | Bar | Very Good | 4.3 | 1200 |
| 4 | BKC DIVE | 1200.0 | North Indian,Chinese,Continental | Bandra Kurla Complex | Bar | Very Good | 4.4 | 5995 |
# Third repetition of the value-inventory check, after rating_type cleanup.
# (Always-true len(...) > 0 guard removed, as in the earlier copies.)
unique = [feature for feature in df.columns if df[feature].nunique(dropna=False) < 100]
for feature in unique:
    print("{} has {} unique values : {} {}".format(
        feature, len(df[feature].unique()), df[feature].unique(), "\n"))
price has 63 unique values : [1200. 400. 1800. 1000. 800. 1300. 1500. 600. 1400. 1100. 2000. 350. 900. 700. 500. 1600. 150. 300. 550. 450. 650. 750. 250. 200. 850. 100. 2500. 1450. 180. 950. 1700. 1900. 2600. 2400. 3000. 1250. 3200. 505. 2200. 1050. 1650. 1350. 50. 920. 1150. 1550. 3500. 398. 5000. 120. 160. 5. 480. 2100. 580. 1850. 220. 249. 2300. 0. 360. 248. 2700.] cusine type has 22 unique values : ['Casual Dining' 'Dessert Parlor' 'Bar' 'Café' 'Quick Bites' 'none' 'Bakery' 'Sweet Shop' 'Food Court' 'Fine Dining' 'Beverage Shop' 'Pub' 'Food Truck' 'Dhaba' 'Lounge' 'Kiosk' 'Microbrewery' 'Paan Shop' 'Irani Cafe' 'Confectionery' 'Mess' 'Bhojanalya'] rating_type has 7 unique values : ['Excellent' 'Very Good' 'Good' 'None' 'Average' 'Poor' 'Very Poor'] rating has 33 unique values : ['4.9' '4.4' '4.2' '4.3' '4.5' '4.7' '4.0' '4.6' '4.1' '3.9' '3.8' '3.6' 'NEW' '3.4' '3.0' '3.7' '4.8' '3.3' '3.5' '3.2' '3.1' '2.7' '2.5' '2.8' '2.6' '2.3' '2.9' '2.4' 'Opening' '2.1' '2.2' '1.8' '2.0']
# Strip the verbose locality prefix (everything up to "-- ") and the
# " East"/" West" suffixes so regions collapse to their base name.
# fix: a second df.columns lowercasing that used to sit here was a no-op —
# the columns were already lowercased earlier — and has been removed.
df['region'] = df['region'].str.replace('[a-zA-Z].+-- ','',regex=True)
df['region'] = df['region'].str.replace(' West| west| East| east','',regex=True)
df.region.unique()
array(['Bandra Kurla Complex', 'Mahim', 'Juhu', 'Flea Bazaar Café',
'Marol', 'Andheri', 'Kamala Mills Compound', 'Dadar', 'Khar',
'Lower Parel', 'Bandra', 'Mumbai CST Area', 'Bhandup', 'Malad',
'Powai', 'Chembur', 'Goregaon', 'Vile Parle', 'CBD-Belapur',
'Borivali', 'Near Andheri Station', 'Vasai', 'Thane', 'Parel',
'Colaba', 'Nariman Point', 'Santacruz', 'Mulund', 'Kandivali',
'Mahakali', 'Airoli', 'Mira Road', 'Fort', 'Ghodbunder Road',
'Jogeshwari', 'Vashi', 'Ghatkopar', 'Sakinaka', '7 Andheri',
'Byculla', 'Kalyan', 'Bhayandar', 'Charni Road', 'Chandivali',
'Kurla', 'Mohammad Ali Road', 'Kharghar', 'Matunga', 'Worli',
'Dadar Shivaji Park', 'Azad Nagar', 'Ulhasnagar', '4 Bungalows',
'Kopar Khairane', 'Dahisar', 'Seawoods', 'Mumbai Central',
'Veera Desai Area', 'Chowpatty', 'Old Panvel', 'Sion', 'Tardeo',
'Mazgaon', 'Prabhadevi', 'Sanpada', 'Ghansoli', 'Virar', 'Girgaum',
'Mumbra', 'Marve', 'Kamothe', 'Chakala', 'Ulwe', 'Marine Lines',
'Mahalaxmi', 'Runwal Green', 'Nalasopara', 'Kalwa', 'Nerul',
'Grant Road', 'Breach Candy', 'New Panvel', 'Churchgate',
'Vikhroli', 'Kalbadevi', 'Dombivali', 'Kemps Corner',
'Malabar Hill', 'Turbhe', 'Kalamboli', 'Wadala', 'Alibaug',
'Peddar Road', 'Ambernath', 'Gorai', 'Majiwada', 'Sewri',
'CBD Belapur', 'Cuffe Parade', 'Girgaon Chowpatty', 'Panvel',
'Trombay'], dtype=object)
# Consolidate sub-localities into their parent region with one exact-value
# mapping. The region values are complete strings, so Series.replace with a
# dict is equivalent to the previous chain of twelve regex substitutions.
region_map = {
    '4 Bungalows': 'Andheri', '7 Andheri': 'Andheri', 'Azad Nagar': 'Andheri',
    'Near Andheri Station': 'Andheri', 'Veera Desai Area': 'Andheri',
    'Mahakali': 'Andheri',
    'Bandra Kurla Complex': 'Bandra',
    'CBD-Belapur': 'Belapur',
    'CBD Belapur': 'Belapur',   # fix: this spelling was previously left unmerged
    'Girgaon Chowpatty': 'Chowpatty',
    'Dadar Shivaji Park': 'Dadar',
    'Flea Bazaar Café': 'Lower Parel', 'Kamala Mills Compound': 'Lower Parel',
    'Runwal Green': 'Mulund',
    'Mumbai CST Area': 'Mumbai Central',
    'Kopar Khairane': 'Navi Mumbai', 'Seawoods': 'Navi Mumbai',
    'Turbhe': 'Navi Mumbai', 'Ulwe': 'Navi Mumbai',
    'New Panvel': 'Panvel', 'Old Panvel': 'Panvel',
    # NOTE(review): Kamothe is usually grouped with Navi Mumbai/Panvel,
    # not Sion — kept as in the original; confirm intent.
    'Kamothe': 'Sion',
    'Ghodbunder Road': 'Thane', 'Majiwada': 'Thane',
}
df['region'] = df['region'].replace(region_map)
df.region.unique()
array(['Bandra', 'Mahim', 'Juhu', 'Lower Parel', 'Marol', 'Andheri',
'Dadar', 'Khar', 'Mumbai Central', 'Bhandup', 'Malad', 'Powai',
'Chembur', 'Goregaon', 'Vile Parle', 'Belapur', 'Borivali',
'Vasai', 'Thane', 'Parel', 'Colaba', 'Nariman Point', 'Santacruz',
'Mulund', 'Kandivali', 'Airoli', 'Mira Road', 'Fort', 'Jogeshwari',
'Vashi', 'Ghatkopar', 'Sakinaka', 'Byculla', 'Kalyan', 'Bhayandar',
'Charni Road', 'Chandivali', 'Kurla', 'Mohammad Ali Road',
'Kharghar', 'Matunga', 'Worli', 'Ulhasnagar', 'Navi Mumbai',
'Dahisar', 'Chowpatty', 'Panvel', 'Sion', 'Tardeo', 'Mazgaon',
'Prabhadevi', 'Sanpada', 'Ghansoli', 'Virar', 'Girgaum', 'Mumbra',
'Marve', 'Chakala', 'Marine Lines', 'Mahalaxmi', 'Nalasopara',
'Kalwa', 'Nerul', 'Grant Road', 'Breach Candy', 'Churchgate',
'Vikhroli', 'Kalbadevi', 'Dombivali', 'Kemps Corner',
'Malabar Hill', 'Kalamboli', 'Wadala', 'Alibaug', 'Peddar Road',
'Ambernath', 'Gorai', 'Sewri', 'CBD Belapur', 'Cuffe Parade',
'Trombay'], dtype=object)
# Confirm no NaNs were introduced by the region cleanup.
df.isnull().sum()
name 0 price 0 cusine_category 0 region 0 cusine type 0 rating_type 0 rating 0 votes 0 dtype: int64
# Replace non-numeric vote placeholders with '0' ahead of the float cast.
# fix: the tokens were lowercase ('new', 'opening') and never matched — the
# actual values in the data are 'NEW' and 'Opening' (see the rating column's
# unique values below), so this replace was a silent no-op.
df['votes'].replace(to_replace=['-','NEW','Opening'], value='0', inplace=True)
df.votes.unique()
array(['3529', '1723', '337', ..., '944', '861', '1249'], dtype=object)
# Spot-check after votes cleanup.
df.head()
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | |
|---|---|---|---|---|---|---|---|---|
| 0 | Hitchki | 1200.0 | Modern Indian,North Indian,Chinese,Momos,Birya... | Bandra | Casual Dining | Excellent | 4.9 | 3529 |
| 1 | Baba Falooda | 400.0 | Desserts,Ice Cream,Beverages | Mahim | Dessert Parlor | Very Good | 4.4 | 1723 |
| 2 | Chin Chin Chu | 1800.0 | Asian,Chinese | Juhu | Casual Dining | Very Good | 4.2 | 337 |
| 3 | Butterfly High | 1000.0 | Modern Indian | Bandra | Bar | Very Good | 4.3 | 1200 |
| 4 | BKC DIVE | 1200.0 | North Indian,Chinese,Continental | Bandra | Bar | Very Good | 4.4 | 5995 |
# Summary statistics; only price is numeric at this point.
df.describe()
| price | |
|---|---|
| count | 11526.000000 |
| mean | 505.309301 |
| std | 309.650892 |
| min | 0.000000 |
| 25% | 300.000000 |
| 50% | 450.000000 |
| 75% | 600.000000 |
| max | 5000.000000 |
# Ratings are still strings and include 'NEW'/'Opening' placeholders.
df['rating'].unique()
array(['4.9', '4.4', '4.2', '4.3', '4.5', '4.7', '4.0', '4.6', '4.1',
'3.9', '3.8', '3.6', 'NEW', '3.4', '3.0', '3.7', '4.8', '3.3',
'3.5', '3.2', '3.1', '2.7', '2.5', '2.8', '2.6', '2.3', '2.9',
'2.4', 'Opening', '2.1', '2.2', '1.8', '2.0'], dtype=object)
# 'NEW', 'Opening' and '-' carry no numeric rating/vote information;
# map them to '0' ahead of the float conversion below.
placeholder_map = {'-': '0', 'NEW': '0', 'Opening': '0'}
df['rating'] = df['rating'].replace(placeholder_map)
df['votes'] = df['votes'].replace(placeholder_map)
df['rating'].unique()
array(['4.9', '4.4', '4.2', '4.3', '4.5', '4.7', '4.0', '4.6', '4.1',
'3.9', '3.8', '3.6', '0', '3.4', '3.0', '3.7', '4.8', '3.3', '3.5',
'3.2', '3.1', '2.7', '2.5', '2.8', '2.6', '2.3', '2.9', '2.4',
'2.1', '2.2', '1.8', '2.0'], dtype=object)
# Cast the now purely numeric rating/votes strings to floats and summarise.
df = df.astype({'rating': float, 'votes': float})
df.describe()
| price | rating | votes | |
|---|---|---|---|
| count | 11526.000000 | 11526.000000 | 11526.000000 |
| mean | 505.309301 | 3.179108 | 162.204060 |
| std | 309.650892 | 1.043701 | 416.886777 |
| min | 0.000000 | 0.000000 | 0.000000 |
| 25% | 300.000000 | 3.100000 | 11.000000 |
| 50% | 450.000000 | 3.400000 | 37.000000 |
| 75% | 600.000000 | 3.700000 | 130.000000 |
| max | 5000.000000 | 4.900000 | 10217.000000 |
# Columns whose dtype is not object count as numerical.
numerical_features = [col for col in df.columns if df[col].dtype != "O"]
print(f"we have {len(numerical_features)} numerical features out of {len(df.columns)}")
we have 3 numerical features out of 8
# Numerical columns with fewer than 25 distinct values count as discrete.
discrete_features = [col for col in numerical_features if df[col].nunique(dropna=False) < 25]
print(f"we have {len(discrete_features)} discrete features out of {len(df.columns)} total features")
we have 0 discrete features out of 8 total features
# Remaining numerical columns are treated as continuous. The variable name
# keeps its original (misspelled) form because later cells reference it.
continuos_features = [feature for feature in numerical_features if feature not in discrete_features]
# fix: the printed message previously misspelled "continuous" as "continuos"
print("we have {} continuous features out of {} total features".format(len(continuos_features),len(df.columns)))
we have 3 continuos features out of 8 total features
# Distribution of each continuous feature (the 3x3 grid is larger than the
# three features need; surplus axes stay empty).
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
for col, axis in zip(continuos_features, axes.flatten()):
    sns.histplot(df[col], ax=axis)
# Price distribution before trimming outliers.
sns.boxplot(df['price'])
plt.title("price- Before outlier removal");
# Remove price outliers above 3000 and re-plot the boxplot.
df = df.drop(df[df['price']>3000].index)
sns.boxplot(df['price'])
plt.title("price- after outlier removal");
# Vote counts before outlier trimming.
sns.scatterplot(data =df['votes'])
plt.title("votes- Before outlier removal");
# Drop extreme vote-count outliers (> 6000) and re-plot.
df = df.drop(df[df['votes']>6000].index)
sns.scatterplot(data =df['votes'])
plt.title("votes- after outlier removal");  # fix: title said "Before" on the post-removal plot
# Tighten the vote-count cut-off further (> 4000) and re-plot.
df = df.drop(df[df['votes']>4000].index)
sns.scatterplot(data =df['votes'])
plt.title("votes- after outlier removal");  # fix: title said "Before" on the post-removal plot
# Re-plot the continuous-feature distributions after outlier removal.
# NOTE(review): `palette` has no effect without `hue`; seaborn warns and ignores it.
fig,ax = plt.subplots(3,3,figsize=(20,12))
for variable, subplot in zip(continuos_features,ax.flatten()):
    sns.histplot(df[variable],ax=subplot,palette="cool")
import plotly.express as px
import plotly.graph_objects as go
# Histogram of restaurant counts per cuisine type.
# fix: the labels key must match the actual (lowercased) column name;
# with 'CUSINE TYPE' the axis-label override was silently ignored.
fig = px.histogram(df, x='cusine type', color='cusine type',
                   title= 'No. of Restaurants by Cuisine Type',
                   labels={'cusine type':'Cuisine Type'})
fig.show()
import plotly.express as px
import plotly.graph_objects as go
# Cuisine-type counts, segmented by raw rating value.
# fix: labels key changed to the lowercased column name so it takes effect.
fig = px.histogram(df, x='cusine type', color='rating',
                   title= 'No. of Restaurants by Cuisine Type',
                   labels={'cusine type':'Cuisine Type'})
fig.show()
# Bucket each rating down to its integer part for coarser grouping.
df['rates'] = np.floor(df['rating'])
df.head()
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | rates | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Hitchki | 1200.0 | Modern Indian,North Indian,Chinese,Momos,Birya... | Bandra | Casual Dining | Excellent | 4.9 | 3529.0 | 4.0 |
| 1 | Baba Falooda | 400.0 | Desserts,Ice Cream,Beverages | Mahim | Dessert Parlor | Very Good | 4.4 | 1723.0 | 4.0 |
| 2 | Chin Chin Chu | 1800.0 | Asian,Chinese | Juhu | Casual Dining | Very Good | 4.2 | 337.0 | 4.0 |
| 3 | Butterfly High | 1000.0 | Modern Indian | Bandra | Bar | Very Good | 4.3 | 1200.0 | 4.0 |
| 5 | Flea Bazaar Café | 800.0 | American,Asian,Street Food,North Indian,Luckno... | Lower Parel | Café | Very Good | 4.2 | 2042.0 | 4.0 |
import plotly.express as px
import plotly.graph_objects as go
# Cuisine-type counts, segmented by the floored rating buckets.
# fix: labels key changed to the lowercased column name so it takes effect.
fig = px.histogram(df, x='cusine type', color='rates',
                   title= 'No. of Restaurants by Cuisine Type',
                   labels={'cusine type':'Cuisine Type'})
fig.show()
# Count restaurants per rating-type bucket.
rating_type_df = df['rating_type'].value_counts().reset_index()
# NOTE(review): the 'RATING_TYPE' key never matches — the count column is named
# 'rating_type' (columns were lowercased earlier), so only 'index' is actually
# renamed; the pie chart below relies on the count column still being
# called 'rating_type'.
rating_type_df.rename(columns={'index':'RATING TYPE', 'RATING_TYPE':'COUNT OF RESTAURANTS'}, inplace=True)
rating_type_df
| RATING TYPE | rating_type | |
|---|---|---|
| 0 | Average | 4983 |
| 1 | Good | 4260 |
| 2 | Very Good | 1129 |
| 3 | None | 979 |
| 4 | Excellent | 93 |
| 5 | Poor | 56 |
| 6 | Very Poor | 4 |
# Share of restaurants per rating type. 'rating_type' here holds the counts
# produced by value_counts() above (the intended rename to
# 'COUNT OF RESTAURANTS' never took effect).
fig = px.pie(rating_type_df, names='RATING TYPE', values='rating_type', color='rating_type',
             title='Percentage of Restaurants by Rating Type').update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# Top-10 highest-rated restaurants whose cuisine list mentions Seafood.
seafood_mask = df['cusine_category'].str.contains('Seafood')
seafood_df = df[seafood_mask]
seafood_df.sort_values(by='rating', ascending=False).head(10)
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | rates | |
|---|---|---|---|---|---|---|---|---|---|
| 6172 | Thangabali | 1000.0 | Seafood,South Indian,Mangalorean,Andhra,Kerala | Khar | Bar | Excellent | 4.7 | 564.0 | 4.0 |
| 70 | Ceremonial Kitchen & Co | 1000.0 | Seafood,Maharashtrian,North Indian,Chinese | Thane | Casual Dining | Excellent | 4.6 | 350.0 | 4.0 |
| 10435 | Maharashtra Lunch Home | 600.0 | Maharashtrian,Malwani,Konkan,Seafood | Kharghar | Casual Dining | Excellent | 4.6 | 209.0 | 4.0 |
| 3139 | Peco Peco | 700.0 | Chinese,Seafood,Asian | Powai | none | Excellent | 4.5 | 497.0 | 4.0 |
| 10086 | Quarter Canteen | 1100.0 | North Indian,Seafood,Chinese | Bandra | Casual Dining | Excellent | 4.5 | 573.0 | 4.0 |
| 8511 | Pi Bar and Kitchen | 1600.0 | Continental,European,Italian,Seafood,Pizza,Des... | Andheri | Bar | Excellent | 4.5 | 2068.0 | 4.0 |
| 833 | The Harbour Bay - SeaFood Kitchen & Bar | 2400.0 | Seafood,Beverages | Bandra | Casual Dining | Excellent | 4.5 | 100.0 | 4.0 |
| 816 | Rajmanya- Seafood family restaurant | 800.0 | Maharashtrian,Konkan,Seafood | Vashi | Casual Dining | Excellent | 4.5 | 178.0 | 4.0 |
| 845 | Monis Bar and Restaurant | 1000.0 | North Indian,Chinese,Continental,Seafood,Bever... | Thane | Casual Dining | Very Good | 4.4 | 662.0 | 4.0 |
| 312 | Zesty Kitchen | 600.0 | North Indian,Maharashtrian,Konkan,Mangalorean,... | Chandivali | Casual Dining | Very Good | 4.4 | 191.0 | 4.0 |
# The two best-rated food trucks.
foodtruck_df = df.loc[df['cusine type'] == 'Food Truck']
foodtruck_df.sort_values(by='rating', ascending=False).head(2)
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | rates | |
|---|---|---|---|---|---|---|---|---|---|
| 241 | Dumpling Delights | 200.0 | Momos | Matunga | Food Truck | Very Good | 4.3 | 212.0 | 4.0 |
| 940 | Street Food Co. | 250.0 | Fast Food,Chinese | Virar | Food Truck | Very Good | 4.1 | 274.0 | 4.0 |
# Correlation heatmap over the numeric columns. numeric_only=True preserves
# the old pandas behaviour (silently skipping object columns) on pandas >= 2.0,
# where df.corr() on a mixed-dtype frame raises; the matrix is also computed
# once instead of twice.
corrmat = df.corr(numeric_only=True)
plt.figure(figsize=(20,20))
g= sns.heatmap(corrmat,annot= True,cmap="RdYlGn")
# Show only strong positive correlations (>= 0.5, excluding the diagonal).
# numeric_only=True keeps pre-2.0 pandas behaviour on the mixed-dtype frame.
pos_corrmat = df.corr(numeric_only=True)
filtereddf = pos_corrmat[((pos_corrmat >= .5) & (pos_corrmat !=1.000))]
plt.figure(figsize=(15,5))
sns.heatmap(filtereddf, annot=True, cmap="Reds")
plt.title('Positive correlation matrix')
plt.show()
# Show only notable negative correlations (< -0.3, excluding the diagonal).
# numeric_only=True keeps pre-2.0 pandas behaviour on the mixed-dtype frame.
neg_corrmat = df.corr(numeric_only=True)
filtereddf = neg_corrmat[((neg_corrmat <-.3) & (neg_corrmat !=1.000))]
plt.figure(figsize=(15,5))
sns.heatmap(filtereddf, annot=True, cmap="Reds")
plt.title('Negative correlation matrix')
plt.show()
# Pairwise scatter matrix coloured by rating.
# NOTE(review): hue='rating' has ~32 distinct float levels — slow to render
# and hard to read; hue='rates' would be a clearer choice.
sns.pairplot(df,hue='rating')
<seaborn.axisgrid.PairGrid at 0x1e9a94c7e50>
# Working copy without the derived 'rates' column. (In the original, the
# first drop's result was immediately overwritten, so only 'rates' is
# effectively removed here; 'name' is dropped in a later cell.)
df1 = df.drop(columns=['rates'])
df1
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | |
|---|---|---|---|---|---|---|---|---|
| 0 | Hitchki | 1200.0 | Modern Indian,North Indian,Chinese,Momos,Birya... | Bandra | Casual Dining | Excellent | 4.9 | 3529.0 |
| 1 | Baba Falooda | 400.0 | Desserts,Ice Cream,Beverages | Mahim | Dessert Parlor | Very Good | 4.4 | 1723.0 |
| 2 | Chin Chin Chu | 1800.0 | Asian,Chinese | Juhu | Casual Dining | Very Good | 4.2 | 337.0 |
| 3 | Butterfly High | 1000.0 | Modern Indian | Bandra | Bar | Very Good | 4.3 | 1200.0 |
| 5 | Flea Bazaar Café | 800.0 | American,Asian,Street Food,North Indian,Luckno... | Lower Parel | Café | Very Good | 4.2 | 2042.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11522 | Tirupati Balaji | 500.0 | Chinese,Fast Food,North Indian | Andheri | Casual Dining | Good | 3.5 | 267.0 |
| 11523 | Hari Om Snack Bar | 350.0 | Fast Food,South Indian,Chinese | Kandivali | Quick Bites | Good | 3.7 | 64.0 |
| 11524 | PitaBurg | 400.0 | Fast Food,Lebanese | Lower Parel | none | Average | 3.4 | 99.0 |
| 11525 | Uncha Otlawala | 300.0 | Desserts,Ice Cream | Kandivali | Dessert Parlor | Good | 3.5 | 29.0 |
| 11526 | Mandarin Panda | 400.0 | Desserts,Chinese,Thai | Malad | none | Good | 3.7 | 121.0 |
11504 rows × 8 columns
# Integer-encode the categorical columns so they can feed the regressors.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for column in ['cusine type', 'region', 'cusine_category', 'rating_type']:
    df1[column] = le.fit_transform(df1[column])
df1.head()
| name | price | cusine_category | region | cusine type | rating_type | rating | votes | |
|---|---|---|---|---|---|---|---|---|
| 0 | Hitchki | 1200.0 | 1821 | 4 | 5 | 1 | 4.9 | 3529.0 |
| 1 | Baba Falooda | 400.0 | 1072 | 43 | 7 | 5 | 4.4 | 1723.0 |
| 2 | Chin Chin Chu | 1800.0 | 57 | 31 | 5 | 5 | 4.2 | 337.0 |
| 3 | Butterfly High | 1000.0 | 1815 | 4 | 1 | 5 | 4.3 | 1200.0 |
| 5 | Flea Bazaar Café | 800.0 | 2 | 41 | 4 | 5 | 4.2 | 2042.0 |
df1.head()
# Drop the free-text restaurant name — not a usable model feature.
df1 = df1.drop(columns='name')
df1.shape
(11504, 7)
# Safety net: drop any remaining NaN rows (none at this point — the shape
# stays (11504, 7)).
df1.dropna(how='any',inplace=True)
df1.shape
(11504, 7)
# Feature matrix: every column except the regression target 'rating'.
f1 = df1.drop(columns=['rating'])
f1.head()
| price | cusine_category | region | cusine type | rating_type | votes | |
|---|---|---|---|---|---|---|
| 0 | 1200.0 | 1821 | 4 | 5 | 1 | 3529.0 |
| 1 | 400.0 | 1072 | 43 | 7 | 5 | 1723.0 |
| 2 | 1800.0 | 57 | 31 | 5 | 5 | 337.0 |
| 3 | 1000.0 | 1815 | 4 | 1 | 5 | 1200.0 |
| 5 | 800.0 | 2 | 41 | 4 | 5 | 2042.0 |
# Confirm all feature columns are numeric after encoding.
f1.dtypes
price float64 cusine_category int32 region int32 cusine type int32 rating_type int32 votes float64 dtype: object
# Ratings are now floats, with 0.0 standing in for NEW/Opening placeholders.
df['rating'].unique()
array([4.9, 4.4, 4.2, 4.3, 4.5, 4.7, 4. , 4.1, 3.9, 3.8, 3.6, 0. , 3.4,
3. , 4.6, 3.7, 4.8, 3.3, 3.5, 3.2, 3.1, 2.7, 2.5, 2.8, 2.6, 2.3,
2.9, 2.4, 2.1, 2.2, 1.8, 2. ])
# Regression target: the numeric rating values as a NumPy array.
label = df['rating'].to_numpy()
label
array([4.9, 4.4, 4.2, ..., 3.4, 3.5, 3.7])
from sklearn.preprocessing import StandardScaler
# NOTE(review): `ss` is created but never used — the scaling cell below
# constructs its own StandardScaler instance.
ss=StandardScaler()
f1
| price | cusine_category | region | cusine type | rating_type | votes | |
|---|---|---|---|---|---|---|
| 0 | 1200.0 | 1821 | 4 | 5 | 1 | 3529.0 |
| 1 | 400.0 | 1072 | 43 | 7 | 5 | 1723.0 |
| 2 | 1800.0 | 57 | 31 | 5 | 5 | 337.0 |
| 3 | 1000.0 | 1815 | 4 | 1 | 5 | 1200.0 |
| 5 | 800.0 | 2 | 41 | 4 | 5 | 2042.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 11522 | 500.0 | 582 | 3 | 5 | 2 | 267.0 |
| 11523 | 350.0 | 1328 | 36 | 19 | 2 | 64.0 |
| 11524 | 400.0 | 1240 | 41 | 21 | 0 | 99.0 |
| 11525 | 300.0 | 1071 | 36 | 7 | 2 | 29.0 |
| 11526 | 400.0 | 1065 | 45 | 21 | 2 | 121.0 |
11504 rows × 6 columns
# Standardise the features (zero mean, unit variance) in a single step.
f1 = StandardScaler().fit_transform(f1)
f1
array([[ 2.30626764, 0.47160834, -1.39899677, -0.90583168, -0.32650302,
9.86040944],
[-0.34125157, -0.44329655, 0.17220868, -0.6414244 , 2.20327688,
4.58592752],
[ 4.29190705, -1.68312094, -0.31123915, -0.90583168, 2.20327688,
0.53806929],
...,
[-0.34125157, -0.23808424, 0.09163404, 1.20942659, -0.95894799,
-0.15701747],
[-0.67219147, -0.44451805, -0.10980256, -0.6414244 , 0.30594196,
-0.36145475],
[-0.34125157, -0.45184707, 0.25278332, 1.20942659, 0.30594196,
-0.09276575]])
from sklearn.model_selection import train_test_split
# fix: test_size was 0. (an empty/invalid test split). The recorded shapes
# (10353 train / 1151 test of 11504 rows) correspond to a 10% hold-out.
X_train,X_test,y_train,y_test=train_test_split(f1,label,test_size=0.1,random_state=22)
X_train.shape,X_test.shape,y_train.shape,y_test.shape
((10353, 6), (1151, 6), (10353,), (1151,))
# Baseline model: ordinary least squares linear regression.
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg
LinearRegression()
# Fit the baseline on the training split.
lin_reg.fit(X_train,y_train)
LinearRegression()
# R^2 on train vs test (~0.14 each — the linear fit explains little variance).
print(lin_reg.score(X_train,y_train))
print(lin_reg.score(X_test,y_test))
0.1414453027302125 0.1490038246780161
# Test-set predictions and R^2 scaled to a percentage (the "accuracy"
# figure used for comparison throughout this script).
lin_reg_pred=lin_reg.predict(X_test)
from sklearn.metrics import r2_score
lin_reg_score=r2_score(y_test,lin_reg_pred)*100
print("Accuracy score for LR :",lin_reg_score)
Accuracy score for LR : 14.90038246780161
# Second model: Ridge regression (L2-regularised linear model, default alpha).
from sklearn.linear_model import Ridge
rr=Ridge()
rr
Ridge()
# Fit Ridge on the training split.
rr.fit(X_train,y_train)
Ridge()
# R^2 on train vs test — essentially identical to plain linear regression.
print(rr.score(X_train,y_train))
print(rr.score(X_test,y_test))
0.1414453009753005 0.14900268869513744
# Test-set predictions and percentage R^2 for Ridge.
rr_pred=rr.predict(X_test)
from sklearn.metrics import r2_score
rr_score=r2_score(y_test,rr_pred)*100
print("Accuracy score for RidgeR :",rr_score)
Accuracy score for RidgeR : 14.900268869513745
# Third model: Lasso regression (L1-regularised; note the variable name `lr`
# clashes visually with the earlier linear-regression naming).
from sklearn.linear_model import Lasso
lr=Lasso()
lr
Lasso()
# Fit Lasso on the training split.
lr.fit(X_train,y_train)
Lasso()
# R^2 on train vs test — default alpha shrinks all coefficients to zero here.
print(lr.score(X_train,y_train))
print(lr.score(X_test,y_test))
0.0 -0.00011737831711000624
# Test-set predictions and percentage R^2 for Lasso.
lr_pred=lr.predict(X_test)
from sklearn.metrics import r2_score
lr_score=r2_score(y_test,lr_pred)*100
# fix: the label said "LR" (copy-paste) although this is the Lasso model
print("Accuracy score for Lasso :",lr_score)
Accuracy score for LR : -0.011737831711000624
# Fourth model: random forest regressor (default hyperparameters).
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor()
rfr
RandomForestRegressor()
# Fit the random forest on the training split.
rfr.fit(X_train,y_train)
RandomForestRegressor()
# R^2 on train vs test — near-perfect train fit; strong test score.
print(rfr.score(X_train,y_train))
print(rfr.score(X_test,y_test))
0.9960612633562352 0.9747745012100493
# Test-set predictions and percentage R^2 for the random forest.
rfr_pred=rfr.predict(X_test)
from sklearn.metrics import r2_score
rfr_score=r2_score(y_test,rfr_pred)*100
# fix: the label said "LR" (copy-paste) although this is the random forest
print("Accuracy score for RFR :",rfr_score)
Accuracy score for LR : 97.47745012100492
# Fifth model: support vector regressor (default RBF kernel), fitted directly.
from sklearn.svm import SVR
svr=SVR()
svr.fit(X_train,y_train)
SVR()
# R^2 on train vs test for the SVR.
print(svr.score(X_train,y_train))
print(svr.score(X_test,y_test))
0.884537566795772 0.8932078931357496
# Test-set predictions and percentage R^2 for the SVR.
svr_pred=svr.predict(X_test)
from sklearn.metrics import r2_score
svr_score=r2_score(y_test,svr_pred)*100
# fix: the label said "LR" (copy-paste) although this is the SVR model
print("Accuracy score for SVR :",svr_score)
Accuracy score for LR : 89.32078931357495
# Comparison table of all five models (fix: column header typo "socre" -> "score").
pd.DataFrame({"Model Names":['Linear Reg','Ridge Reg','Lasso Reg','RandomForest Reg','Support Vector Reg'],
              "Accuracy score":[lin_reg_score,rr_score,lr_score,rfr_score,svr_score]})
| Model Names | Accuracy socre | |
|---|---|---|
| 0 | Linear Reg | 14.900382 |
| 1 | Ridge Reg | 14.900269 |
| 2 | Lasso Reg | -0.011738 |
| 3 | RandomForest Reg | 97.477450 |
| 4 | Support Vector Reg | 89.320789 |
# Scaled feature matrix, shown for reference before the ad-hoc prediction below.
f1
array([[ 2.30626764, 0.47160834, -1.39899677, -0.90583168, -0.32650302,
9.86040944],
[-0.34125157, -0.44329655, 0.17220868, -0.6414244 , 2.20327688,
4.58592752],
[ 4.29190705, -1.68312094, -0.31123915, -0.90583168, 2.20327688,
0.53806929],
...,
[-0.34125157, -0.23808424, 0.09163404, 1.20942659, -0.95894799,
-0.15701747],
[-0.67219147, -0.44451805, -0.10980256, -0.6414244 , 0.30594196,
-0.36145475],
[-0.34125157, -0.45184707, 0.25278332, 1.20942659, 0.30594196,
-0.09276575]])
# Ad-hoc sanity check: predict one hand-crafted sample.
# NOTE(review): these values must lie in the *scaled* feature space produced
# by StandardScaler above, and sklearn may warn about the bare-list input.
rfr.predict([[1.3,0.7,1,1,0.234,7]])
array([3.894])
# Side-by-side comparison of actual vs predicted ratings on the test set.
# fix: column header typo "Actual Ratin" -> "Actual Rating" (this header is
# also written to the CSV export below).
sample=pd.DataFrame({"Actual Rating":y_test,
                     "Predicted Rating":np.round(rfr_pred,2)})
sample
| Actual Ratin | Predicted Rating | |
|---|---|---|
| 0 | 2.5 | 3.08 |
| 1 | 3.2 | 3.28 |
| 2 | 3.4 | 3.28 |
| 3 | 3.1 | 3.08 |
| 4 | 3.7 | 3.66 |
| ... | ... | ... |
| 1146 | 3.2 | 3.11 |
| 1147 | 3.0 | 3.04 |
| 1148 | 3.8 | 3.68 |
| 1149 | 4.1 | 4.20 |
| 1150 | 3.6 | 3.65 |
1151 rows × 2 columns
# Persist the comparison table for reporting.
sample.to_csv("Model Prediction Sample.csv",index=False)